# Run this cell if using Google colab and delete the first and last rows
# Mounts Google Drive at two mount points; only the gdrive one is used below
# via drive_path. NOTE(review): the /content/drive mount looks redundant.
from google.colab import drive
drive.mount('/content/drive')
drive.mount('/content/gdrive')
# Shared-drive folder holding the crx dataset and the local dmba helper module
drive_path = '/content/gdrive/Shareddrives/Data_Mining/Dataset/'
Mounted at /content/drive Mounted at /content/gdrive
# Preliminary: libraries and data import
import pandas as pd
import numpy as np
import matplotlib as plt  # NOTE(review): immediately shadowed by the pyplot import below; redundant
import seaborn as sns
import matplotlib.pyplot as plt
import sys
# Make the shared-drive folder importable so the local dmba module resolves
sys.path.append(drive_path)
import dmba
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Replace w/ the dataset here. Update working directory as necessary.
# crx.data (UCI Credit Approval data) ships with no header row
df = pd.read_csv(drive_path + "crx.data", header = None)
# Bare expression: echoes the dmba module path in the notebook output
dmba
<module 'dmba' from '/content/gdrive/Shareddrives/Data_Mining/Dataset/dmba.py'>
# Seeing the data features
# Bare expression: renders the raw DataFrame in notebook output
df
# Columns aren't named and the values are anonymized/non-sensical at this point.
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | b | 30.83 | 0.000 | u | g | w | v | 1.25 | t | t | 1 | f | g | 00202 | 0 | + |
| 1 | a | 58.67 | 4.460 | u | g | q | h | 3.04 | t | t | 6 | f | g | 00043 | 560 | + |
| 2 | a | 24.50 | 0.500 | u | g | q | h | 1.50 | t | f | 0 | f | g | 00280 | 824 | + |
| 3 | b | 27.83 | 1.540 | u | g | w | v | 3.75 | t | t | 5 | t | g | 00100 | 3 | + |
| 4 | b | 20.17 | 5.625 | u | g | w | v | 1.71 | t | f | 0 | f | s | 00120 | 0 | + |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 685 | b | 21.08 | 10.085 | y | p | e | h | 1.25 | f | f | 0 | f | g | 00260 | 0 | - |
| 686 | a | 22.67 | 0.750 | u | g | c | v | 2.00 | f | t | 2 | t | g | 00200 | 394 | - |
| 687 | a | 25.25 | 13.500 | y | p | ff | ff | 2.00 | f | t | 1 | t | g | 00200 | 1 | - |
| 688 | b | 17.92 | 0.205 | u | g | aa | v | 0.04 | f | f | 0 | f | g | 00280 | 750 | - |
| 689 | b | 35.00 | 3.375 | u | g | c | h | 8.29 | f | f | 0 | t | g | 00000 | 0 | - |
690 rows × 16 columns
# Give the positional columns human-readable names so later cells are legible.
column_names = ['Gender', 'Age', 'Debt', 'Married', 'BankCustomer',
                'EducationLevel', 'Ethnicity', 'YearsEmployed', 'PriorDefault',
                'Employed', 'CreditScore', 'DriversLicense', 'Citizen',
                'ZipCode', 'Income', 'ApprovalStatus']
# Map column position -> name (keys 0..15 match the unnamed integer columns)
d = dict(enumerate(column_names))
df.rename(columns=d, inplace=True)
# Dimensions of data: (rows, columns)
df.shape
(690, 16)
df.size # size of data frame: total cell count (690 rows x 16 columns = 11040)
11040
# Render the renamed DataFrame to confirm the new column labels
df
| Gender | Age | Debt | Married | BankCustomer | EducationLevel | Ethnicity | YearsEmployed | PriorDefault | Employed | CreditScore | DriversLicense | Citizen | ZipCode | Income | ApprovalStatus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | b | 30.83 | 0.000 | u | g | w | v | 1.25 | t | t | 1 | f | g | 00202 | 0 | + |
| 1 | a | 58.67 | 4.460 | u | g | q | h | 3.04 | t | t | 6 | f | g | 00043 | 560 | + |
| 2 | a | 24.50 | 0.500 | u | g | q | h | 1.50 | t | f | 0 | f | g | 00280 | 824 | + |
| 3 | b | 27.83 | 1.540 | u | g | w | v | 3.75 | t | t | 5 | t | g | 00100 | 3 | + |
| 4 | b | 20.17 | 5.625 | u | g | w | v | 1.71 | t | f | 0 | f | s | 00120 | 0 | + |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 685 | b | 21.08 | 10.085 | y | p | e | h | 1.25 | f | f | 0 | f | g | 00260 | 0 | - |
| 686 | a | 22.67 | 0.750 | u | g | c | v | 2.00 | f | t | 2 | t | g | 00200 | 394 | - |
| 687 | a | 25.25 | 13.500 | y | p | ff | ff | 2.00 | f | t | 1 | t | g | 00200 | 1 | - |
| 688 | b | 17.92 | 0.205 | u | g | aa | v | 0.04 | f | f | 0 | f | g | 00280 | 750 | - |
| 689 | b | 35.00 | 3.375 | u | g | c | h | 8.29 | f | f | 0 | t | g | 00000 | 0 | - |
690 rows × 16 columns
# Tally '?' placeholders per column so they can be handled before LabelEncoder
# would otherwise treat '?' as a legitimate category level.
for col in df.columns:
    count = df[col].eq('?').sum()
    if count:
        print(f"Column {col}: ? Count: {count}")
Column Gender: ? Count: 12 Column Age: ? Count: 12 Column Married: ? Count: 6 Column BankCustomer: ? Count: 6 Column EducationLevel: ? Count: 9 Column Ethnicity: ? Count: 9 Column ZipCode: ? Count: 13
# Swap every '?' placeholder for a proper missing value (NaN).
df.replace('?', np.nan, inplace=True)

# Verify the replacement: every column should now report zero '?' entries.
for col in df.columns:
    count = df[col].eq('?').sum()
    if count > 0:
        print(f"Column {col}:? Count: {count}")
    else:
        print(f"Column {col}: No ? values found.")
Column Gender: No ? values found. Column Age: No ? values found. Column Debt: No ? values found. Column Married: No ? values found. Column BankCustomer: No ? values found. Column EducationLevel: No ? values found. Column Ethnicity: No ? values found. Column YearsEmployed: No ? values found. Column PriorDefault: No ? values found. Column Employed: No ? values found. Column CreditScore: No ? values found. Column DriversLicense: No ? values found. Column Citizen: No ? values found. Column ZipCode: No ? values found. Column Income: No ? values found. Column ApprovalStatus: No ? values found.
# Print a plot of all the new missing values that were plugged in instead of '?'
null_values = df.isnull().sum()
plt.figure(figsize=(12, 6))
# One bar per column; bar height = number of NaNs introduced by the replacement
null_values.plot(kind="bar")
plt.ylabel("Number of Null Values")
plt.title("Total Count of Null Values per Column")
plt.xticks(rotation=45)
plt.show()
# Inspect column dtypes (Age still reads as object; converted in the next cell)
df.dtypes
Gender object Age object Debt float64 Married object BankCustomer object EducationLevel object Ethnicity object YearsEmployed float64 PriorDefault object Employed object CreditScore int64 DriversLicense object Citizen object ZipCode object Income int64 ApprovalStatus object dtype: object
# Convert age into a float type because it is being read as an object
# (the '?' placeholders originally forced the whole column to object dtype)
df["Age"] = df["Age"].astype(float)
# Confirm Age is now float64
df.dtypes
Gender object Age float64 Debt float64 Married object BankCustomer object EducationLevel object Ethnicity object YearsEmployed float64 PriorDefault object Employed object CreditScore int64 DriversLicense object Citizen object ZipCode object Income int64 ApprovalStatus object dtype: object
# Impute the remaining NaNs: mode for categorical columns, median for numeric.
# Positions refer to the original column order (0=Gender, 1=Age, ..., 14=Income).
cat_cols = [0, 3, 4, 5, 6, 12, 13]
for pos in cat_cols:
    series = df.iloc[:, pos]
    mode = series.dropna().mode()
    if not mode.empty:
        df.iloc[:, pos] = series.fillna(mode.iloc[0])

continuous_cols = [1, 2, 7, 14]
for pos in continuous_cols:
    series = df.iloc[:, pos]
    df.iloc[:, pos] = series.fillna(series.median())

df.head()
| Gender | Age | Debt | Married | BankCustomer | EducationLevel | Ethnicity | YearsEmployed | PriorDefault | Employed | CreditScore | DriversLicense | Citizen | ZipCode | Income | ApprovalStatus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | b | 30.83 | 0.000 | u | g | w | v | 1.25 | t | t | 1 | f | g | 00202 | 0 | + |
| 1 | a | 58.67 | 4.460 | u | g | q | h | 3.04 | t | t | 6 | f | g | 00043 | 560 | + |
| 2 | a | 24.50 | 0.500 | u | g | q | h | 1.50 | t | f | 0 | f | g | 00280 | 824 | + |
| 3 | b | 27.83 | 1.540 | u | g | w | v | 3.75 | t | t | 5 | t | g | 00100 | 3 | + |
| 4 | b | 20.17 | 5.625 | u | g | w | v | 1.71 | t | f | 0 | f | s | 00120 | 0 | + |
# Re-check dtypes after imputation (unchanged: imputation preserves each dtype)
df.dtypes
Gender object Age float64 Debt float64 Married object BankCustomer object EducationLevel object Ethnicity object YearsEmployed float64 PriorDefault object Employed object CreditScore int64 DriversLicense object Citizen object ZipCode object Income int64 ApprovalStatus object dtype: object
# Recode every remaining object column (including the '+'/'-' target) to
# integer codes so distributions and correlations can be computed.
# NOTE: label encoding imposes an arbitrary ordering on nominal categories.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
object_cols = [c for c in df.columns if df[c].dtype == 'object']
for c in object_cols:
    df[c] = le.fit_transform(df[c].astype(str))
df
| Gender | Age | Debt | Married | BankCustomer | EducationLevel | Ethnicity | YearsEmployed | PriorDefault | Employed | CreditScore | DriversLicense | Citizen | ZipCode | Income | ApprovalStatus | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 30.83 | 0.000 | 1 | 0 | 12 | 7 | 1.25 | 1 | 1 | 1 | 0 | 0 | 68 | 0 | 0 |
| 1 | 0 | 58.67 | 4.460 | 1 | 0 | 10 | 3 | 3.04 | 1 | 1 | 6 | 0 | 0 | 11 | 560 | 0 |
| 2 | 0 | 24.50 | 0.500 | 1 | 0 | 10 | 3 | 1.50 | 1 | 0 | 0 | 0 | 0 | 96 | 824 | 0 |
| 3 | 1 | 27.83 | 1.540 | 1 | 0 | 12 | 7 | 3.75 | 1 | 1 | 5 | 1 | 0 | 31 | 3 | 0 |
| 4 | 1 | 20.17 | 5.625 | 1 | 0 | 12 | 7 | 1.71 | 1 | 0 | 0 | 0 | 2 | 37 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 685 | 1 | 21.08 | 10.085 | 2 | 2 | 4 | 3 | 1.25 | 0 | 0 | 0 | 0 | 0 | 90 | 0 | 1 |
| 686 | 0 | 22.67 | 0.750 | 1 | 0 | 1 | 7 | 2.00 | 0 | 1 | 2 | 1 | 0 | 67 | 394 | 1 |
| 687 | 0 | 25.25 | 13.500 | 2 | 2 | 5 | 2 | 2.00 | 0 | 1 | 1 | 1 | 0 | 67 | 1 | 1 |
| 688 | 1 | 17.92 | 0.205 | 1 | 0 | 0 | 7 | 0.04 | 0 | 0 | 0 | 0 | 0 | 96 | 750 | 1 |
| 689 | 1 | 35.00 | 3.375 | 1 | 0 | 1 | 3 | 8.29 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
690 rows × 16 columns
# Report the distinct encoded levels of each categorical column — binary
# columns show 2 levels ([0, 1]); EducationLevel and Ethnicity show many more.
categorical_columns = ['Gender', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity',
                       'PriorDefault', 'Employed', 'DriversLicense', 'Citizen']
for name in categorical_columns:
    print(f"Column {name}")
    print("Levels:", df[name].unique())
    print("\n")
Column Gender Levels: [1 0] Column Married Levels: [1 2 0] Column BankCustomer Levels: [0 2 1] Column EducationLevel Levels: [12 10 9 11 2 8 1 3 13 6 4 0 5 7] Column Ethnicity Levels: [7 3 0 2 4 8 6 1 5] Column PriorDefault Levels: [1 0] Column Employed Levels: [1 0] Column DriversLicense Levels: [0 1] Column Citizen Levels: [0 2 1]
# Great, this worked.
# Now we have data in each column that's numeric
# Per-column summary statistics, transposed so each row is one feature
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Gender | 690.0 | 0.695652 | 0.460464 | 0.00 | 0.000 | 1.00 | 1.0000 | 1.00 |
| Age | 690.0 | 31.514116 | 11.860245 | 13.75 | 22.670 | 28.46 | 37.7075 | 80.25 |
| Debt | 690.0 | 4.758725 | 4.978163 | 0.00 | 1.000 | 2.75 | 7.2075 | 28.00 |
| Married | 690.0 | 1.233333 | 0.430063 | 0.00 | 1.000 | 1.00 | 1.0000 | 2.00 |
| BankCustomer | 690.0 | 0.475362 | 0.850238 | 0.00 | 0.000 | 0.00 | 0.0000 | 2.00 |
| EducationLevel | 690.0 | 5.698551 | 4.285748 | 0.00 | 1.000 | 5.00 | 10.0000 | 13.00 |
| Ethnicity | 690.0 | 5.098551 | 2.510731 | 0.00 | 3.000 | 7.00 | 7.0000 | 8.00 |
| YearsEmployed | 690.0 | 2.223406 | 3.346513 | 0.00 | 0.165 | 1.00 | 2.6250 | 28.50 |
| PriorDefault | 690.0 | 0.523188 | 0.499824 | 0.00 | 0.000 | 1.00 | 1.0000 | 1.00 |
| Employed | 690.0 | 0.427536 | 0.495080 | 0.00 | 0.000 | 0.00 | 1.0000 | 1.00 |
| CreditScore | 690.0 | 2.400000 | 4.862940 | 0.00 | 0.000 | 0.00 | 3.0000 | 67.00 |
| DriversLicense | 690.0 | 0.457971 | 0.498592 | 0.00 | 0.000 | 0.00 | 1.0000 | 1.00 |
| Citizen | 690.0 | 0.176812 | 0.557869 | 0.00 | 0.000 | 0.00 | 0.0000 | 2.00 |
| ZipCode | 690.0 | 56.189855 | 46.386934 | 0.00 | 17.000 | 52.00 | 93.0000 | 169.00 |
| Income | 690.0 | 1017.385507 | 5210.102598 | 0.00 | 0.000 | 5.00 | 395.5000 | 100000.00 |
| ApprovalStatus | 690.0 | 0.555072 | 0.497318 | 0.00 | 0.000 | 1.00 | 1.0000 | 1.00 |
# Check for any last NA values (all zeros expected after the imputation above)
df.isnull().sum()
Gender 0 Age 0 Debt 0 Married 0 BankCustomer 0 EducationLevel 0 Ethnicity 0 YearsEmployed 0 PriorDefault 0 Employed 0 CreditScore 0 DriversLicense 0 Citizen 0 ZipCode 0 Income 0 ApprovalStatus 0 dtype: int64
# It looks like we have 0s in almost every column, even if that column is continuous
# Let's start cleaning the data
# Draw a 4x4 grid of histograms (16 columns) to see where 0s concentrate.
plt.figure(figsize=(12, 8))
for idx, column in enumerate(df.columns, start=1):
    plt.subplot(4, 4, idx)
    sns.histplot(data=df, x=column, kde=True)
plt.tight_layout()
plt.show()
We decided not to remove zeros from the data. We have already handled the NA values, so we assume that the 0s in the categorical variables are genuine category codes rather than missing values.
In addition, since we're a community bank, we expect to see many zeros in years employed, as our clients may not have had traditional employment. In addition, the zip code data have been anonymized, so the number of 0s in this column is not concerning. We also expect some applicants to have a zero income, as this indicates they are unemployed.
# Build the initial modeling set: drop protected / non-informative columns
# plus the target, keeping ApprovalStatus separately as y.
X = df.drop(["Age", "Gender", "DriversLicense", "ZipCode", "Ethnicity",
             "Citizen", "ApprovalStatus"], axis=1)
y = df["ApprovalStatus"]

# Standardize the features — the raw columns mix very different scales/units.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Back to a labeled DataFrame, then re-attach the (unscaled) target.
df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
df_subset = pd.concat([df_scaled, y], axis=1)

# Check the first few rows of the new DataFrame
df_subset.head()
| Debt | Married | BankCustomer | EducationLevel | YearsEmployed | PriorDefault | Employed | CreditScore | Income | ApprovalStatus | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.956613 | -0.54295 | -0.559499 | 1.471393 | -0.291083 | 0.95465 | 1.157144 | -0.288101 | -0.195413 | 0 |
| 1 | -0.060051 | -0.54295 | -0.559499 | 1.004392 | 0.244190 | 0.95465 | 1.157144 | 0.740830 | -0.087852 | 0 |
| 2 | -0.856102 | -0.54295 | -0.559499 | 1.004392 | -0.216324 | 0.95465 | -0.864196 | -0.493887 | -0.037144 | 0 |
| 3 | -0.647038 | -0.54295 | -0.559499 | 1.471393 | 0.456505 | 0.95465 | 1.157144 | 0.535044 | -0.194837 | 0 |
| 4 | 0.174141 | -0.54295 | -0.559499 | 1.471393 | -0.153526 | 0.95465 | -0.864196 | -0.493887 | -0.195413 | 0 |
# Checking our work: distributions of the 9 scaled features in a 3x3 grid
plt.figure(figsize=(12, 8))
for i, col in enumerate(df_scaled, start=1):
    plt.subplot(3,3, i)
    sns.histplot(data=df_scaled, x=col, kde=True)
plt.tight_layout()
plt.show()
# This looks much better
import matplotlib.pyplot as plt
import seaborn as sns
# Define numerical columns based on df_subset columns
numerical_cols = ['Debt', 'Married', 'BankCustomer', 'EducationLevel', 'YearsEmployed',
'PriorDefault', 'Employed', 'CreditScore', 'Income']
# Plot histograms
plt.figure(figsize=(12, 8))
for i, col in enumerate(numerical_cols, start=1):
plt.subplot(3, 3, i)
sns.histplot(data=df, x=col, kde=True)
plt.tight_layout()
plt.show()
df_subset.columns
Index(['Debt', 'Married', 'BankCustomer', 'EducationLevel', 'YearsEmployed',
'PriorDefault', 'Employed', 'CreditScore', 'Income', 'ApprovalStatus'],
dtype='object')
import matplotlib.pyplot as plt
import seaborn as sns
# Define categorical columns
categorical_cols = ['Married', 'BankCustomer', 'EducationLevel', 'PriorDefault', 'Employed']
# Plot bar plots (level frequencies) for categorical variables, 2x3 grid
plt.figure(figsize=(12,8))
for i, col in enumerate(categorical_cols, start=1):
    plt.subplot(2,3, i)
    sns.countplot(data=df, x=col)
    plt.title(col)
plt.tight_layout()
plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Calculate the correlation matrix of the full dataframe
# (valid for all 16 columns now that everything is label-encoded/numeric)
corr_matrix = df.corr()
# Visualize the correlation matrix
plt.figure(figsize=(10, 8)) # Adjust the size as needed
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True, cbar=True)
plt.title('Correlation Matrix of Remaining Fields')
plt.show()
# All features are float64 after scaling; the target remains int64
print(df_subset.dtypes)
Debt float64 Married float64 BankCustomer float64 EducationLevel float64 YearsEmployed float64 PriorDefault float64 Employed float64 CreditScore float64 Income float64 ApprovalStatus int64 dtype: object
# Looking at our subset of data we've decided to include in the models
# (note the ~0.98 Married/BankCustomer correlation, which motivates
# dropping Married a few cells below)
correlation_matrix = df_subset.corr()
print(correlation_matrix)
Debt Married BankCustomer EducationLevel \
Debt 1.000000 -0.091526 -0.079364 0.023373
Married -0.091526 1.000000 0.982257 -0.049977
BankCustomer -0.079364 0.982257 1.000000 -0.055812
EducationLevel 0.023373 -0.049977 -0.055812 1.000000
YearsEmployed 0.298902 -0.080624 -0.073064 0.037001
PriorDefault 0.244317 -0.129863 -0.142094 0.109642
Employed 0.174846 -0.162464 -0.173199 0.128549
CreditScore 0.271207 -0.106457 -0.112750 0.006978
Income 0.123121 -0.120065 -0.025170 0.004808
ApprovalStatus -0.206294 0.194306 0.185134 -0.130434
YearsEmployed PriorDefault Employed CreditScore Income \
Debt 0.298902 0.244317 0.174846 0.271207 0.123121
Married -0.080624 -0.129863 -0.162464 -0.106457 -0.120065
BankCustomer -0.073064 -0.142094 -0.173199 -0.112750 -0.025170
EducationLevel 0.037001 0.109642 0.128549 0.006978 0.004808
YearsEmployed 1.000000 0.345689 0.222982 0.322330 0.051345
PriorDefault 0.345689 1.000000 0.432032 0.379532 0.090012
Employed 0.222982 0.432032 1.000000 0.571498 0.077652
CreditScore 0.322330 0.379532 0.571498 1.000000 0.063692
Income 0.051345 0.090012 0.077652 0.063692 1.000000
ApprovalStatus -0.322475 -0.720407 -0.458301 -0.406410 -0.175657
ApprovalStatus
Debt -0.206294
Married 0.194306
BankCustomer 0.185134
EducationLevel -0.130434
YearsEmployed -0.322475
PriorDefault -0.720407
Employed -0.458301
CreditScore -0.406410
Income -0.175657
ApprovalStatus 1.000000
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
# It looks like Married and BankCustomer are almost perfectly correlated, so we'll drop "Married" as well.
# Banks are also not allowed to use marital status to make credit decisions
# NOTE(review): y_train is only defined further below (at the train_test_split),
# so run top-to-bottom this line raises NameError — the notebook was executed
# out of order. Confirm intended cell order.
print(y_train.value_counts())
1 306 0 246 Name: ApprovalStatus, dtype: int64
# Confirm the target is binary-encoded as {0, 1}
print(y_train.unique())
[0 1]
# Cleaning our dataframe to contain the final values
# Keep a handle on the full encoded frame for the bias audit later;
# df is re-bound (not mutated) below, so df_original retains all 16 columns
df_original = df
cols_to_keep = ['Debt', 'BankCustomer', 'EducationLevel', 'YearsEmployed', 'PriorDefault',
'Employed', 'CreditScore', 'Income', 'ApprovalStatus']
df = df[cols_to_keep]
df
| Debt | BankCustomer | EducationLevel | YearsEmployed | PriorDefault | Employed | CreditScore | Income | ApprovalStatus | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000 | 0 | 12 | 1.25 | 1 | 1 | 1 | 0 | 0 |
| 1 | 4.460 | 0 | 10 | 3.04 | 1 | 1 | 6 | 560 | 0 |
| 2 | 0.500 | 0 | 10 | 1.50 | 1 | 0 | 0 | 824 | 0 |
| 3 | 1.540 | 0 | 12 | 3.75 | 1 | 1 | 5 | 3 | 0 |
| 4 | 5.625 | 0 | 12 | 1.71 | 1 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 685 | 10.085 | 2 | 4 | 1.25 | 0 | 0 | 0 | 0 | 1 |
| 686 | 0.750 | 0 | 1 | 2.00 | 0 | 1 | 2 | 394 | 1 |
| 687 | 13.500 | 2 | 5 | 2.00 | 0 | 1 | 1 | 1 | 1 |
| 688 | 0.205 | 0 | 0 | 0.04 | 0 | 0 | 0 | 750 | 1 |
| 689 | 3.375 | 0 | 1 | 8.29 | 0 | 0 | 0 | 0 | 1 |
690 rows × 9 columns
from sklearn.tree import DecisionTreeClassifier
# FIX: import train_test_split here — in top-to-bottom script order it was
# only imported much further down (with GridSearchCV), so this cell would
# raise NameError when the file is run as a script.
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df.drop('ApprovalStatus', axis=1)
y = df['ApprovalStatus']
# Split the data into training and testing sets
# (stratified so the class balance is preserved in both splits)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
# Initialize and train a depth-1 tree (a decision stump) as a baseline
clf = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
clf.fit(x_train, y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
import pydotplus
from IPython.display import Image
# Adding this line myself
from sklearn import tree
# Resuming to code that was in the notebook
# Render the fitted stump; encoded class 0 corresponds to '+' (approved),
# since LabelEncoder sorts '+' before '-'
dot_data = tree.export_graphviz(clf, feature_names=x_train.columns
, class_names=['approved','not approved'],filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# Class balance of the training target
print(y_train.value_counts())
1 306 0 246 Name: ApprovalStatus, dtype: int64
# Unpruned tree (no depth limit) for comparison with the stump above
clf_2 = DecisionTreeClassifier(criterion='entropy',
random_state=0)
# NOTE: clf is re-bound to this model; later metric cells score whichever
# classifier was fit most recently
clf = clf_2.fit(x_train, y_train)
dot_data = tree.export_graphviz(clf_2, feature_names=x_train.columns
, class_names=['approved','not approved'],filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# This clearly overfits the data
# Depth-5 tree: a middle ground between the stump and the unpruned tree
clf_3 = DecisionTreeClassifier(criterion='entropy', max_depth = 5,
random_state=0)
# clf now points at the depth-5 model; the metrics cell below scores this one
clf = clf_3.fit(x_train, y_train)
dot_data = tree.export_graphviz(clf_3, feature_names=x_train.columns
, class_names=['approved','not approved'],filled=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# Model statistics for the most recently fitted tree (clf = the depth-5 model).
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Training-set accuracy (gauges overfitting when compared with the test score)
y_train_pred = clf.predict(x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy: ", train_accuracy)

# Held-out test-set accuracy
y_test_pred = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy: ", test_accuracy)

# Precision and recall on the test set (binary target, positive class = 1)
precision = precision_score(y_test, y_test_pred, average='binary')
recall = recall_score(y_test, y_test_pred, average='binary')
print("Precision: ", precision)
print("Recall: ", recall)
Training Accuracy: 0.8894927536231884 Test Accuracy: 0.8478260869565217 Precision: 0.9117647058823529 Recall: 0.8051948051948052
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Make predictions on the held-out test set
y_pred = clf.predict(x_test)
# Generate confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
# Plot confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
# Hyperparameter grid: 4 * 6 * 5 * 5 * 2 = 1200 candidate combinations
param_grid = {
'n_estimators': np.arange(50, 201, 50),
'max_depth': [None] + list(np.arange(5, 30, 5)),
'min_samples_split': np.arange(2, 11, 2),
'min_samples_leaf': np.arange(1, 11, 2),
'bootstrap': [True, False],
}
rf = RandomForestClassifier(random_state=0)
# Configure Grid Search: 5-fold CV, all cores
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
# Fit Grid Search to the data
# NOTE(review): the search is fit on the FULL X, y, so the later held-out test
# evaluation uses rows the search already saw — consider searching on the
# training split only to avoid this leakage.
grid_search.fit(X, y)
# Best parameters found
print("Best parameters found: ", grid_search.best_params_)
# Print the top 5 parameter combinations
print("\nTop 5 parameter combinations:")
# Convert cv_results_ to a DataFrame for easier manipulation
results_df = pd.DataFrame(grid_search.cv_results_)
# Sort the results by 'rank_test_score' and select the top 5
top_5_results = results_df.sort_values(by='rank_test_score').head(5)
# Iterate over the rows of the top 5 results and print the parameters and their corresponding mean test score
for index, row in top_5_results.iterrows():
    print(f"Rank: {row['rank_test_score']}, Mean Test Score: {row['mean_test_score']:.4f}, Params: {row['params']}")
Fitting 5 folds for each of 1200 candidates, totalling 6000 fits
Best parameters found: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}
Top 5 parameter combinations:
Rank: 1, Mean Test Score: 0.8551, Params: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}
Rank: 2, Mean Test Score: 0.8536, Params: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Rank: 3, Mean Test Score: 0.8522, Params: {'bootstrap': False, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}
Rank: 3, Mean Test Score: 0.8522, Params: {'bootstrap': False, 'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}
Rank: 3, Mean Test Score: 0.8522, Params: {'bootstrap': True, 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}
# Still using separate train-test split for a final evaluation:
# df.index is split alongside X and y so test-set predictions can be traced
# back to the original rows for the bias audit further below
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, df.index, test_size=0.2, random_state=0, stratify=y)
X_train.shape,X_test.shape,y_train.shape,y_test.shape
((483, 8), (207, 8), (483,), (207,))
# Final 8-feature design matrix
X
| Debt | BankCustomer | EducationLevel | YearsEmployed | PriorDefault | Employed | CreditScore | Income | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.000 | 0 | 12 | 1.25 | 1 | 1 | 1 | 0 |
| 1 | 4.460 | 0 | 10 | 3.04 | 1 | 1 | 6 | 560 |
| 2 | 0.500 | 0 | 10 | 1.50 | 1 | 0 | 0 | 824 |
| 3 | 1.540 | 0 | 12 | 3.75 | 1 | 1 | 5 | 3 |
| 4 | 5.625 | 0 | 12 | 1.71 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 685 | 10.085 | 2 | 4 | 1.25 | 0 | 0 | 0 | 0 |
| 686 | 0.750 | 0 | 1 | 2.00 | 0 | 1 | 2 | 394 |
| 687 | 13.500 | 2 | 5 | 2.00 | 0 | 1 | 1 | 1 |
| 688 | 0.205 | 0 | 0 | 0.04 | 0 | 0 | 0 | 750 |
| 689 | 3.375 | 0 | 1 | 8.29 | 0 | 0 | 0 | 0 |
690 rows × 8 columns
# Refit a forest with the best hyperparameters reported by the grid search
rf = RandomForestClassifier(n_estimators=100, random_state=0, min_samples_split= 4, min_samples_leaf=1, max_depth=5, bootstrap=True)
rf.fit(X_train, y_train)
test_score = rf.score(X_test, y_test)
print("Final Test Set Accuracy: ", test_score)
Final Test Set Accuracy: 0.8888888888888888
# variable (feature) importance plot
importances = rf.feature_importances_
# FIX: loop variable renamed est -> the original used `tree`, which shadowed
# the sklearn `tree` module imported earlier and would break any subsequent
# tree.export_graphviz call.
std = np.std([est.feature_importances_ for est in rf.estimators_], axis=0)
# One row per feature: mean importance plus its std across the forest's trees
df_scores = pd.DataFrame({'feature': X_train.columns, 'importance': importances, 'std': std})
df_scores = df_scores.sort_values('importance')
print(df_scores)
feature importance std 1 BankCustomer 0.016046 0.020176 2 EducationLevel 0.048985 0.046039 0 Debt 0.077667 0.051225 5 Employed 0.089832 0.121144 3 YearsEmployed 0.094340 0.087963 7 Income 0.132432 0.117848 6 CreditScore 0.133783 0.141691 4 PriorDefault 0.406915 0.216781
import matplotlib.pyplot as plt
# Horizontal bars of feature importance with std error bars across trees
ax = df_scores.plot(kind='barh', xerr='std', x='feature', legend=False)
ax.set_ylabel('')
plt.show()
# confusion matrix for train set
dmba.classificationSummary(y_train, rf.predict(X_train))
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
# Predict on the TRAINING set (despite the original comment saying test)
y_pred = rf.predict(X_train)
# Generate confusion matrix
conf_matrix = confusion_matrix(y_train, y_pred)
print("Confusion Matrix:\n", conf_matrix)
# accuracy
accuracy = accuracy_score(y_train, y_pred)
print("Accuracy: ", accuracy)
# precision
precision = precision_score(y_train, y_pred, average='binary')
print("Precision: ", precision)
# Calculate recall
recall = recall_score(y_train, y_pred, average='binary')
print("Recall: ", recall)
Confusion Matrix (Accuracy 0.8986)
Prediction
Actual 0 1
0 190 25
1 24 244
Confusion Matrix:
[[190 25]
[ 24 244]]
Accuracy: 0.8985507246376812
Precision: 0.9070631970260223
Recall: 0.9104477611940298
# confusion matrix for TEST set (mirrors the training-set cell above)
dmba.classificationSummary(y_test, rf.predict(X_test))
# Predict on the test set
y_pred = rf.predict(X_test)
# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
# accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
# precision
precision = precision_score(y_test, y_pred, average='binary')
print("Precision: ", precision)
# Calculate recall
recall = recall_score(y_test, y_pred, average='binary')
print("Recall: ", recall)
Confusion Matrix (Accuracy 0.8889)
Prediction
Actual 0 1
0 81 11
1 12 103
Confusion Matrix:
[[ 81 11]
[ 12 103]]
Accuracy: 0.8888888888888888
Precision: 0.9035087719298246
Recall: 0.8956521739130435
We want to make sure that our model is not biased with respect to protected attributes such as ethnicity and gender. We excluded these columns from the model for legal reasons, but we still need to verify that the predicted approvals are not biased against these protected classes.
# merge predictions with broader set of original DF values to compare
predictions = rf.predict(X_test)
# Create a DataFrame from indices_test and predictions.
# FIX: index the frame by the ORIGINAL row labels of the test samples. The
# original left a default RangeIndex (0..206), so the index-on-index merge
# below paired test-sample i with original row i — every prediction was
# attached to the WRONG applicant's attributes, invalidating the bias audit.
predictions_df = pd.DataFrame({'ID': indices_test, 'Prediction': predictions}, index=indices_test)
# Merge this DataFrame with the original to include sensitive attributes;
# the left join leaves NaN for rows that were not in the test set
merged_df = pd.merge(df_original, predictions_df, left_index=True, right_index=True, how='left')
merged_df.head(5)
| Gender | Age | Debt | Married | BankCustomer | EducationLevel | Ethnicity | YearsEmployed | PriorDefault | Employed | CreditScore | DriversLicense | Citizen | ZipCode | Income | ApprovalStatus | ID | Prediction | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 30.83 | 0.000 | 1 | 0 | 12 | 7 | 1.25 | 1 | 1 | 1 | 0 | 0 | 68 | 0 | 0 | 194.0 | 0.0 |
| 1 | 0 | 58.67 | 4.460 | 1 | 0 | 10 | 3 | 3.04 | 1 | 1 | 6 | 0 | 0 | 11 | 560 | 0 | 132.0 | 0.0 |
| 2 | 0 | 24.50 | 0.500 | 1 | 0 | 10 | 3 | 1.50 | 1 | 0 | 0 | 0 | 0 | 96 | 824 | 0 | 337.0 | 1.0 |
| 3 | 1 | 27.83 | 1.540 | 1 | 0 | 12 | 7 | 3.75 | 1 | 1 | 5 | 1 | 0 | 31 | 3 | 0 | 271.0 | 1.0 |
| 4 | 1 | 20.17 | 5.625 | 1 | 0 | 12 | 7 | 1.71 | 1 | 0 | 0 | 0 | 2 | 37 | 0 | 0 | 579.0 | 0.0 |
# Approval rate and application volume on the test set, broken out by Gender.
by_gender = merged_df.groupby('Gender')['Prediction']
approval_rates = by_gender.mean()
application_counts = by_gender.count()  # count() skips rows without a prediction
# Combine both metrics into a single DataFrame, with the rate as a percentage
summary_df = pd.DataFrame({
    'Application Count': application_counts,
    'Approval Rate': approval_rates * 100,
})
print("Approval rates and application counts by Gender:")
print(summary_df)
Approval rates and application counts by Gender:
Application Count Approval Rate
Gender
0 69 56.521739
1 138 54.347826
import scipy.stats as stats
# Keep only rows that actually received a prediction (the test set);
# all other rows carry NaN from the left join above
merged_df = merged_df.dropna()
groups = [group['Prediction'].values for name, group in merged_df.groupby('Gender')]
# Conduct ANOVA (with two gender groups this is equivalent to a t-test)
anova_result = stats.f_oneway(*groups)
print(f"ANOVA F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")
if anova_result.pvalue < 0.05:
    print("We reject the null hypothesis - there are significant differences in approval rates between groups.")
else:
    print("We fail to reject the null hypothesis - no significant difference in approval rates between groups.")
ANOVA F-statistic: 0.0870, p-value: 0.7683 We fail to reject the null hypothesis - no significant difference in approval rates between groups.
# Calculate approval rates by Ethnicity
approval_rates = merged_df.groupby('Ethnicity')['Prediction'].mean()
# Calculate counts of applications by Ethnicity
application_counts = merged_df.groupby('Ethnicity')['Prediction'].count()
# Combine both metrics into a single DataFrame
summary_df = pd.DataFrame({
    'Application Count': application_counts,
    'Approval Rate': approval_rates
})
# Convert approval rate to percentage for better readability
summary_df['Approval Rate'] = summary_df['Approval Rate'] * 100
print("Approval rates and application counts by Ethnicity:")
print(summary_df)
Approval rates and application counts by Ethnicity:
Application Count Approval Rate
Ethnicity
0 24 54.166667
2 4 75.000000
3 60 56.666667
4 2 100.000000
7 114 53.508772
8 3 33.333333
# ANOVA test: are the approval-rate differences between Ethnicity groups
# statistically significant?
groups = [grp['Prediction'].values for _, grp in merged_df.groupby('Ethnicity')]

anova_result = stats.f_oneway(*groups)
print(f"ANOVA F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")

# Interpret against the usual 0.05 cutoff.
if anova_result.pvalue < 0.05:
    print("We reject the null hypothesis - there are significant differences in approval rates between groups.")
else:
    print("We fail to reject the null hypothesis - no significant difference in approval rates between groups.")
ANOVA F-statistic: 0.5969, p-value: 0.7023 We fail to reject the null hypothesis - no significant difference in approval rates between groups.
# Approval rate per Citizen group: the mean of the binary Prediction column.
approval_rates = merged_df.groupby('Citizen')['Prediction'].mean()

# How many applications each Citizen group contains.
application_counts = merged_df.groupby('Citizen')['Prediction'].count()

# Side-by-side table of both metrics, with the rate shown as a percentage.
summary_df = pd.DataFrame(
    {'Application Count': application_counts,
     'Approval Rate': approval_rates}
)
summary_df['Approval Rate'] *= 100

print("Approval rates and application counts by Citizenship:")
print(summary_df)
Approval rates and application counts by Citizenship:
Application Count Approval Rate
Citizen
0 192 56.770833
1 1 0.000000
2 14 35.714286
# ANOVA test: are the approval-rate differences between Citizen groups
# statistically significant?
groups = [grp['Prediction'].values for _, grp in merged_df.groupby('Citizen')]

anova_result = stats.f_oneway(*groups)
print(f"ANOVA F-statistic: {anova_result.statistic:.4f}, p-value: {anova_result.pvalue:.4f}")

# Interpret against the usual 0.05 cutoff.
if anova_result.pvalue < 0.05:
    print("We reject the null hypothesis - there are significant differences in approval rates between groups.")
else:
    print("We fail to reject the null hypothesis - no significant difference in approval rates between groups.")
ANOVA F-statistic: 1.7900, p-value: 0.1696 We fail to reject the null hypothesis - no significant difference in approval rates between groups.
from sklearn.model_selection import train_test_split

# Features: everything in the scaled subset except the target and Married.
X = df_subset.drop(['ApprovalStatus', 'Married'], axis=1)
# Target variable.
y = df_subset['ApprovalStatus']
print(X.head())

# 80/20 stratified split keeps the class balance identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)

# Sanity-check the resulting partition sizes.
for label, part in (("X_train", X_train), ("X_test", X_test),
                    ("y_train", y_train), ("y_test", y_test)):
    print(f"{label} shape:", part.shape)
Debt BankCustomer EducationLevel YearsEmployed PriorDefault \ 0 -0.956613 -0.559499 1.471393 -0.291083 0.95465 1 -0.060051 -0.559499 1.004392 0.244190 0.95465 2 -0.856102 -0.559499 1.004392 -0.216324 0.95465 3 -0.647038 -0.559499 1.471393 0.456505 0.95465 4 0.174141 -0.559499 1.471393 -0.153526 0.95465 Employed CreditScore Income 0 1.157144 -0.288101 -0.195413 1 1.157144 0.740830 -0.087852 2 -0.864196 -0.493887 -0.037144 3 1.157144 0.535044 -0.194837 4 -0.864196 -0.493887 -0.195413 X_train shape: (552, 8) X_test shape: (138, 8) y_train shape: (552,) y_test shape: (138,)
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import xgboost as xgb

# Distributions to sample hyperparameters from (random search, not a grid).
param_dist = {
    "max_depth": randint(1, 10),
    "min_child_weight": randint(1, 5),
    "gamma": randint(0, 2),
    "learning_rate": uniform(0.001, 0.2),
    "n_estimators": randint(50, 300),
    "scale_pos_weight": randint(1, 10),
    "alpha": uniform(0, 100),
}

# 200 random draws, each scored with 5-fold cross-validation.
clf = xgb.XGBClassifier()
random_search = RandomizedSearchCV(
    clf, param_distributions=param_dist,
    n_iter=200, cv=5, verbose=1, random_state=0)
random_search.fit(X_train, y_train)

print("Score:", random_search.best_score_)
print("Parameters:", random_search.best_params_)

# Keep the refitted best model for later evaluation.
bestXGB = random_search.best_estimator_
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Score: 0.8532350532350532
Parameters: {'alpha': 47.360041934665745, 'gamma': 0, 'learning_rate': 0.15584673788684333, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 84, 'scale_pos_weight': 1}
# Confusion matrices for the tuned XGBoost model on both partitions.
for title, features, labels in (
        ('Training dataset performance:', X_train, y_train),
        ('\nTest dataset performance:', X_test, y_test)):
    print(title)
    dmba.classificationSummary(labels, bestXGB.predict(features))
Training dataset performance:
Confusion Matrix (Accuracy 0.8533)
Prediction
Actual 0 1
0 226 20
1 61 245
Test dataset performance:
Confusion Matrix (Accuracy 0.8623)
Prediction
Actual 0 1
0 58 3
1 16 61
# Precision and recall of the tuned XGBoost model on the test set.
# NOTE(review): precision_score / recall_score had no visible import in this
# cell's context; import them explicitly so the cell is self-contained.
from sklearn.metrics import precision_score, recall_score

# Predict once and reuse — the original ran inference twice for no benefit.
y_pred_xgb = bestXGB.predict(X_test)

# Precision: share of predicted approvals that were truly approved.
precision = precision_score(y_test, y_pred_xgb)
print("Precision: ", precision)

# Recall: share of true approvals the model recovered.
recall = recall_score(y_test, y_pred_xgb)
print("Recall: ", recall)
Precision: 0.953125 Recall: 0.7922077922077922
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np

# Exhaustive grid search over regularization strength and penalty norm.
# 'liblinear' supports both l1 and l2 and suits small datasets.
lr = LogisticRegression()
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# 5-fold CV scored by ROC AUC; raise on errors rather than scoring NaN.
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='roc_auc', error_score='raise')
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# best_estimator_ is already refit on the full training data — no need to
# build and fit a fresh LogisticRegression.
best_lr = grid_search.best_estimator_

# Accuracy on both partitions to gauge over/under-fitting.
y_pred = best_lr.predict(X_test)
y_pred_train = best_lr.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training set accuracy:", train_accuracy)
print("Test set accuracy:", test_accuracy)
Best parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Training set accuracy: 0.8677536231884058
Test set accuracy: 0.8768115942028986
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test set (rows: actual, columns: predicted).
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix: [[57 4] [13 64]]
best_model = grid_search.best_estimator_

# Learned weight per feature.
# NOTE(review): for a binary problem sklearn stores a single coefficient row
# (for the positive class), so the outer loop runs once — confirm if the
# label set ever grows.
coefficients = best_model.coef_
feature_names = X.columns

print("\nCoefficients for each class against each feature:")
for index, row in enumerate(coefficients):
    print(f"Class {index}:")
    for feature, coef in zip(feature_names, row):
        print(f"{feature}: {coef}")
Coefficients for each class against each feature: Class 0: Debt: 0.08387356320869674 BankCustomer: 0.3266207931197453 EducationLevel: -0.13856405553456244 YearsEmployed: -0.29557509653458 PriorDefault: -1.5797169045768271 Employed: -0.21709257430133705 CreditScore: -0.5108983504902503 Income: -1.6569334783397407
# Drop the columns at positions 0, 1, and 4 of the scaled feature matrix —
# gender, debt, and education level.
X_modified = np.delete(X_scaled, [0, 1, 4], axis=1)
print(X_modified)

# Fresh stratified 80/20 split on the reduced feature set.
# NOTE(review): random_state=42 here differs from the earlier split's 0, so
# this partition is not comparable row-for-row with the previous one — confirm
# whether that was intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X_modified, y, test_size=0.20, random_state=42, stratify=y)
[[-0.55949891 1.47139336 0.95465038 1.15714435 -0.28810053 -0.19541334] [-0.55949891 1.00439179 0.95465038 1.15714435 0.74082993 -0.08785188] [-0.55949891 1.00439179 0.95465038 -0.86419641 -0.49388662 -0.03714433] ... [ 1.79449039 -0.16311214 -1.04750391 1.15714435 -0.28810053 -0.19522126] [-0.55949891 -1.33061608 -1.04750391 -0.86419641 -0.49388662 -0.05135781] [-0.55949891 -1.09711529 -1.04750391 -0.86419641 -0.49388662 -0.19541334]]
# Re-tune logistic regression on the reduced feature set.
lr = LogisticRegression()

# Same search space as before: regularization strength C, penalty norm, and
# the 'liblinear' solver (handles both l1 and l2 on small datasets).
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# 5-fold cross-validation scored by ROC AUC; surface errors immediately.
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='roc_auc', error_score='raise')
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)

# GridSearchCV delegates predict() to its refit best estimator.
y_pred = grid_search.predict(X_test)
y_pred_train = grid_search.predict(X_train)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training set accuracy:", train_accuracy)
print("Test set accuracy:", test_accuracy)
Best parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Training set accuracy: 0.8858695652173914
Test set accuracy: 0.7971014492753623
# Confusion matrix for the reduced-feature model (rows: actual, cols: predicted).
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
Confusion Matrix: [[52 9] [19 58]]
# Precision: share of predicted approvals that were truly approved.
precision = precision_score(y_test, y_pred, average='binary')
print("Precision: ", precision)

# Recall: share of true approvals the model recovered.
recall = recall_score(y_test, y_pred, average='binary')
print("Recall: ", recall)
Precision: 0.8656716417910447 Recall: 0.7532467532467533
Note: please see separate submission for the neural network script